Cluster Regions

Code
library(dplyr) ; library(tidyr) ; library(ggplot2)

# Root criterion used to locate the project directory: a directory that
# contains the file `.git/index`.
root <- rprojroot::has_file(".git/index")

# Project sub-directories, resolved against the project root.
datadir <- root$find_file("data")
funsdir <- root$find_file("functions")
savingdir <- root$find_file("saved_files")

# Names of all entries in the functions directory.
files_vec <- list.files(funsdir)

# Source every helper file. Iterating over the names themselves (rather than
# over `1:length(files_vec)`) makes the loop a no-op when the directory is
# empty; `1:0` would otherwise run for i = 1 and i = 0 and try to source a
# path that does not exist. `funsdir` is already resolved against the project
# root, so it is joined with the file name directly instead of being passed
# through `root$find_file()` a second time.
for (fun_file in files_vec) {
  source(file.path(funsdir, fun_file))
}

# dat_tax = data.table::fread('https://raw.githubusercontent.com/rafaelcatoia/zoop_16N/main/treated_taxonomy_dat.csv') %>%
#   as_tibble()
# 
# ### Loading the new data 
# grump_version = 'new'
# 
# if(grump_version=='new'){
# ## Use this if you are using the new GRUMP Data Set
#   datapath = root$find_file(paste0(datadir,'/','grump_asv_long20240501.csv'))
# dframe = data.table::fread(input = datapath) %>%
#   filter(Cruise %in% c('P16N','P16S')) %>% 
#   #filter(Raw.Sequence.Counts>0) %>% 
#   filter(Domain!='Unassigned') %>% 
#   mutate(Raw.Sequence.Counts = Corrected_sequence_counts)
# 
# }else{
#   ## Or this one if you are using the OLD GRUMP Data Set
#   datapath = root$find_file(paste0(datadir,'/','grump_asv_long_20240110.csv'))
#   dframe = data.table::fread(input = datapath) %>%
#     filter(Cruise %in% c('P16N','P16S')) %>% 
#     #filter(Raw.Sequence.Counts>0) %>% 
#     filter(Domain!='Unassigned')
# }
# 
# dat_tax %>% select(starts_with('P16')) %>% dim()

Here we used two different clustering methods, and used a geographic-distance (geodist) matrix to induce geographic coherence in the resulting clusters.

Code
# Load the cached evaluation table from the saved-files directory.
df_evaluation <- readRDS(file.path(savingdir, 'df_evaluation'))

# Long-format summary of every non-grouping column of `df_evaluation`:
# one row per (ncluster, method, alpha, DistMetric, summary_metric, Metric),
# where `summary_metric` is 'Mean' or 'SD' and `value` holds the statistic.
# `summarise(across(everything(), ...))` replaces the superseded
# `summarise_all()`; `.groups = 'drop'` returns an ungrouped table, so no
# separate `ungroup()` is needed.
df_summary <- bind_rows(
  df_evaluation %>%
    group_by(ncluster, method, alpha, DistMetric) %>%
    summarise(across(everything(), mean), .groups = 'drop') %>%
    mutate(summary_metric = 'Mean') %>%
    pivot_longer(
      cols = -c(ncluster, method, alpha, DistMetric, summary_metric),
      names_to = 'Metric'
    ),

  df_evaluation %>%
    group_by(ncluster, method, alpha, DistMetric) %>%
    summarise(across(everything(), sd), .groups = 'drop') %>%
    mutate(summary_metric = 'SD') %>%
    pivot_longer(
      cols = -c(ncluster, method, alpha, DistMetric, summary_metric),
      names_to = 'Metric'
    )
)

# Plot the `within_sum` metric against the number of clusters: the line is the
# group mean and the ribbon spans mean +/- one standard deviation. Panels are
# split by distance metric (rows) and alpha (columns).
df_summary %>%
  filter(Metric == 'within_sum') %>%
  select(-Metric) %>%
  # Spread the 'Mean' and 'SD' rows into two columns of the same names.
  pivot_wider(
    id_cols = ncluster:DistMetric,
    names_from = summary_metric,
    values_from = value
  ) %>%
  # Treat alpha as categorical so it can drive linetype and faceting.
  mutate(alpha = factor(alpha)) %>%
  ggplot(aes(x = ncluster, y = Mean, color = method, fill = method, linetype = alpha)) +
  geom_line() +
  # Here `alpha = 0.25` is the ribbon transparency, not the `alpha` column.
  geom_ribbon(aes(ymin = Mean - SD, ymax = Mean + SD), alpha = 0.25) +
  facet_grid(DistMetric ~ alpha, scales = 'free') +
  theme_minimal() +
  theme(legend.position = 'bottom')

Code
# Mean +/- SD of the `avg_max_dist_within` metric versus the number of
# clusters, drawn from the current `df_summary`; one panel per
# distance metric (rows) and alpha value (columns).
df_summary %>%
  filter(Metric == 'avg_max_dist_within') %>%
  select(-Metric) %>%
  # One row per parameter combination, with `Mean` and `SD` as columns.
  pivot_wider(
    id_cols = ncluster:DistMetric,
    names_from = summary_metric
  ) %>%
  mutate(alpha = factor(alpha)) %>%
  ggplot(
    mapping = aes(
      x = ncluster,
      y = Mean,
      color = method,
      fill = method,
      linetype = alpha
    )
  ) +
  geom_line() +
  # The constant `alpha = 0.25` sets ribbon transparency; it is unrelated to
  # the `alpha` column used for linetype and faceting.
  geom_ribbon(
    mapping = aes(ymin = Mean - SD, ymax = Mean + SD),
    alpha = 0.25
  ) +
  facet_grid(DistMetric ~ alpha, scales = 'free') +
  theme_minimal() +
  theme(legend.position = 'bottom')

Code
# Load the cached evaluation table for the latitude-mirrored run.
df_evaluation_lat_mirrored <- readRDS(file.path(savingdir, 'df_evaluation_lat_mirrored'))

# Rebuild `df_summary` (overwriting any earlier value) from the lat-mirrored
# evaluation: one row per
# (ncluster, method, alpha, DistMetric, summary_metric, Metric), where
# `summary_metric` is 'Mean' or 'SD' and `value` holds the statistic.
# `summarise(across(everything(), ...))` replaces the superseded
# `summarise_all()`; `.groups = 'drop'` returns an ungrouped table, so no
# separate `ungroup()` is needed.
df_summary <- bind_rows(
  df_evaluation_lat_mirrored %>%
    group_by(ncluster, method, alpha, DistMetric) %>%
    summarise(across(everything(), mean), .groups = 'drop') %>%
    mutate(summary_metric = 'Mean') %>%
    pivot_longer(
      cols = -c(ncluster, method, alpha, DistMetric, summary_metric),
      names_to = 'Metric'
    ),

  df_evaluation_lat_mirrored %>%
    group_by(ncluster, method, alpha, DistMetric) %>%
    summarise(across(everything(), sd), .groups = 'drop') %>%
    mutate(summary_metric = 'SD') %>%
    pivot_longer(
      cols = -c(ncluster, method, alpha, DistMetric, summary_metric),
      names_to = 'Metric'
    )
)

# Plot the `within_sum` metric against the number of clusters: the line is the
# group mean and the ribbon spans mean +/- one standard deviation. Panels are
# split by distance metric (rows) and alpha (columns).
df_summary %>%
  filter(Metric == 'within_sum') %>%
  select(-Metric) %>%
  # Spread the 'Mean' and 'SD' rows into two columns of the same names.
  pivot_wider(
    id_cols = ncluster:DistMetric,
    names_from = summary_metric,
    values_from = value
  ) %>%
  # Treat alpha as categorical so it can drive linetype and faceting.
  mutate(alpha = factor(alpha)) %>%
  ggplot(aes(x = ncluster, y = Mean, color = method, fill = method, linetype = alpha)) +
  geom_line() +
  # Here `alpha = 0.25` is the ribbon transparency, not the `alpha` column.
  geom_ribbon(aes(ymin = Mean - SD, ymax = Mean + SD), alpha = 0.25) +
  facet_grid(DistMetric ~ alpha, scales = 'free') +
  theme_minimal() +
  theme(legend.position = 'bottom')

Code
# Mean +/- SD of the `avg_max_dist_within` metric versus the number of
# clusters, drawn from the current `df_summary`; one panel per
# distance metric (rows) and alpha value (columns).
df_summary %>%
  filter(Metric == 'avg_max_dist_within') %>%
  select(-Metric) %>%
  # One row per parameter combination, with `Mean` and `SD` as columns.
  pivot_wider(
    id_cols = ncluster:DistMetric,
    names_from = summary_metric
  ) %>%
  mutate(alpha = factor(alpha)) %>%
  ggplot(
    mapping = aes(
      x = ncluster,
      y = Mean,
      color = method,
      fill = method,
      linetype = alpha
    )
  ) +
  geom_line() +
  # The constant `alpha = 0.25` sets ribbon transparency; it is unrelated to
  # the `alpha` column used for linetype and faceting.
  geom_ribbon(
    mapping = aes(ymin = Mean - SD, ymax = Mean + SD),
    alpha = 0.25
  ) +
  facet_grid(DistMetric ~ alpha, scales = 'free') +
  theme_minimal() +
  theme(legend.position = 'bottom')

Code
# Recipe that produced the cached `plt1` object, kept commented out because it
# may take a while to run. NOTE(review): `df_geo_abiotics` is not created in
# the visible part of this document -- confirm where it comes from before
# re-enabling these lines.
#list_cluster_membership_and_bounderies = readRDS(file = paste0(savingdir,'/','list_cluster_membership_and_bounderies'))
#grid_base = readRDS(file = paste0(savingdir,'/','grid_base'))
#clust_member_limits = unlist_coloring_obj(list_cluster_membership_and_bounderies)
#plt1 = gen_plots(gridBase = grid_base,clustMemLim = clust_member_limits,df_GeoAbio = df_geo_abiotics)
#saveRDS(plt1,paste0(savingdir,'/','plt1'))

# Read the cached object back from the saved-files directory.
plt1 <- readRDS(file.path(savingdir, 'plt1'))

# Display its `limits_faceted` element.
plt1$limits_faceted

Code
# Display the `clustRegion_facetd` element of the cached `plt1` object
# (presumably a plot, given the object's name -- confirm against `gen_plots()`).
plt1$clustRegion_facetd

Code
# Recipe that produced the cached `plt2` object from the latitude-mirrored
# cluster memberships, kept commented out. NOTE(review): `grid_base` and
# `df_geo_abiotics` are not created by any active code in the visible part of
# this document -- confirm where they come from before re-enabling these lines.
#list_cluster_membership_and_bounderies_mirroredLat = readRDS(file = paste0(savingdir,'/','list_cluster_membership_and_bounderies_mirroredLat'))
#clust_member_limits = unlist_coloring_obj(list_cluster_membership_and_bounderies_mirroredLat)
#plt2 = gen_plots(gridBase = grid_base,clustMemLim = clust_member_limits,df_GeoAbio = df_geo_abiotics)
#saveRDS(plt2,paste0(savingdir,'/','plt2'))

# Read the cached object back from the saved-files directory.
plt2 <- readRDS(file.path(savingdir, 'plt2'))

# Display its `limits_faceted` element.
plt2$limits_faceted

Code
# Display the `clustRegion_facetd` element of the cached `plt2` object
# (presumably a plot, given the object's name -- confirm against `gen_plots()`).
plt2$clustRegion_facetd